source("../../R/mixture_irt_md.R")

library(tidyverse)
library(multidplyr)
library(dplyr, warn.conflicts = FALSE)
library(tictoc)


set.seed(123)

# Pull in mega-survey dataset and arrange into long format
if (!exists("long_data")) {
  load("../../../bigsurveys_recoded_plus_estimates4.RData")
  long_data <- data %>%
    select(source, respondent, pid7, cces2006_minimumwage:cces2017_buyamerican) %>%
    select(-cces2012_immigration5, -cces2012_immigration6) %>% # Drop redundant immigration items in CCES2012
    pivot_longer( cols = cces2006_minimumwage:cces2017_buyamerican,
                  names_to="item",
                  values_to="choice") %>%
    filter(!is.na(choice)) 
}
glimpse(long_data)


# Do a little checking of the sample sizes and number of items per survey
long_data %>% 
  filter(source %in% sprintf("CCES_%i", 2012:2018)) %>%
  group_by(source, item) %>%
  mutate(n=n()) %>%
  filter(n>4999) %>% # Drop the "module" items ask of fewer respondents
  group_by(source) %>%
  mutate(source_n = length(unique(respondent))) %>%
  group_by(source, item) %>%
  summarize(n=n(), 
            source_n=source_n[1], 
            .groups="drop") %>%
  group_by(source) %>%
  summarize(n_items = n(),
            source_n = source_n[1],
            min_responses = min(n),
            max_responses = max(n))


# Global container to hold results
all_res <- list()

# function to model to each survey dataset
fit_em_mix_irt_2d <- function(long_dat, ndim=1, ...) {
  tic()
  cat(sprintf("Working on %s...\n", long_dat$source[1]))
  wide_dat <- long_dat %>%
    group_by(source, item) %>%
    filter(n() > 4999) %>%              # Drop "module" items (items asked of fewer than 5k respondents)
    group_by(source, respondent) %>%
    #filter(n() > 18) %>%                # Drop respondents answering fewer than 19 items.
    pivot_wider(id_cols = c("source", "respondent", "pid7"),
                names_from=item,
                values_from=choice) %>%
    ungroup()
  item_summary <- tibble(item = names(wide_dat %>% select(-source, -respondent, -pid7)),
                       number_responses = wide_dat %>% select(-source, -respondent, -pid7) %>% 
                              apply(2,function(x) sum(!is.na(x))),
                       proportion_ones  = wide_dat %>% select(-source, -respondent, -pid7) %>% 
                              apply(2,function(x) sum(x==1, na.rm=TRUE))/number_responses,
                       invalid_responses = wide_dat %>% select(-source, -respondent, -pid7) %>% 
                              apply(2,function(x) sum(!(is.na(x) | x %in% 0:1)))) %>%
                  mutate(item_number = 1:n()) %>%
                  select(item_number, everything())
  cat("Item summary:\n")
  print(item_summary)
  cat("\n")
  res <- em_mix_irt(wide_dat %>% select(-source, -respondent, -pid7) %>% as.matrix(),
                    ndim=ndim, ...)
  # Set polarity for the first recovered IRT dimension
  flip_sign <- sign(cor(as.numeric(wide_dat$pid7),
                        res$irt$x[,1],
                        use="pairwise.complete.obs"))
  res$item_summary <- item_summary
  all_res[[wide_dat$source[[1]]]] <<- res
  wide_dat <- wide_dat %>%
    mutate(lk1 = res$irt$lk,
           lk2 = res$ivp$lk,
           w1  = res$w[,1],
           w2  = res$w[,2],
           w3  = res$w[,3])
  for (j in 1:ndim) {
    wide_dat[[sprintf("x%i", j)]] <- ifelse(j==1, flip_sign, 1)*res$irt$x[,j] 
  }
  toc()
  wide_dat
}

# Run em_mix_irt on each survey...
resulting_data <- long_data %>% 
  filter(source %in% sprintf("CCES_%i", 2012:2018)) %>%
  group_by(source) %>%
  do(fit_em_mix_irt_2d(., ndim=1, xsamp=5000, iters=100, irt_iters=100)) %>%
  bind_rows() 

# A little check...
cor(resulting_data$x1, 
    as.numeric(resulting_data$pid7), 
    use="complete.obs")
      
# Save results...
save(all_res, file="all_results_1d.RData")
appended_data <- resulting_data %>%
   # Merge addition survey variables back in
   left_join(data %>% 
               select(source, respondent, abb:local_economy),
               by=c("source", "respondent")) 
save(appended_data, file="bigsurveys_recoded_plus_estimates_1d.RData")


